1 Executive Summary

Insert a concise (max 200 word) exectutive summary. It should be a clear, interesting summary of main insights from the report.

2 Exploring the Dataset

library(tidyverse)
library(skimr) 
library(naniar) 
library(kableExtra) 
library(lubridate)
library(plotly)
library(janitor)
library(scales)
raw = read_csv("MVC.csv") %>% clean_names()
# raw %>% 
#   select(vehicle_type_code_1, vehicle_type_code_2, vehicle_type_code_3, vehicle_type_code_4, vehicle_type_code_5) %>%
#   vis_miss(warn_large_data = FALSE)
raw_1 = select(raw, -c(vehicle_type_code_3, vehicle_type_code_4, vehicle_type_code_5))  
# raw_1 %>% 
#  select(contributing_factor_vehicle_1, contributing_factor_vehicle_2, contributing_factor_vehicle_3, contributing_factor_vehicle_4, contributing_factor_vehicle_5) %>% 
#  vis_miss(warn_large_data = FALSE)
raw_2 = select(raw_1, -c(contributing_factor_vehicle_3, contributing_factor_vehicle_4, contributing_factor_vehicle_5))  
data_1 = raw_2[1:10]
data_2 = raw_2[11:23]
# data_1 %>% vis_miss(warn_large_data = FALSE)
data_1 = data_1 %>% select(-c(off_street_name))
# data_2 %>% vis_miss(warn_large_data = FALSE)
data = cbind(data_1, data_2) %>% drop_na()
data = data %>% mutate(`hour` = hour(crash_time))
# raw %>% filter(!is.na(contributing_factor_vehicle_4)) %>% filter(!is.na(contributing_factor_vehicle_5)) %>% 
#  filter(!is.na(contributing_factor_vehicle_1)) %>% 
#  filter(!is.na(contributing_factor_vehicle_2)) %>% 
#  filter(!is.na(contributing_factor_vehicle_3))
raw = raw %>% mutate(`hour` = hour(crash_time))
z1 = raw %>% group_by(hour) %>% summarise(count = n()) %>% mutate(percentage = percent((count/sum(count)), accuracy = 0.01)) %>% as.data.frame()
p = ggplot()+
  geom_bar(data=z1, aes(x=hour,y=count),
          stat="identity",position = "dodge", fill = "#093146")+
  geom_point(data=z1,aes(x=hour,y=count),  colour = "red") +
  geom_line(data=z1,aes(x=hour,y=count, group = 1), colour = "red")+
  theme_classic()+
  labs(x = "Hour", y = "Count")

ggplotly(p)
group_raw = raw %>% drop_na(borough) %>% group_by(hour, borough) %>%
  summarise(count = n())
tt = group_raw %>% 
  ggplot(aes(x= hour, y = count, color= borough)) +
  geom_line() +
  facet_wrap(.~borough, scales = "free_y")
ggplotly(tt)
data = data %>% 
  mutate(crash_date = mdy(`crash_date`))
data = data %>% 
  mutate(month = month(crash_date))
g = data %>% group_by(month)%>% summarise(count = n()) %>% 
  ggplot(aes(x = month, y = count)) +
  geom_line()+
  theme_classic()

ggplotly(g)
raw = raw %>% 
  mutate(crash_date = mdy(`crash_date`))
raw = raw %>% 
  mutate(weekday = weekdays(crash_date))

raw$weekday = raw$weekday %>% 
  ordered(levels=c("Monday", "Tuesday", "Wednesday", "Thursday", 
"Friday", "Saturday", "Sunday"))

z2 = raw %>% group_by(weekday) %>% summarise(count = n()) %>% 
  mutate(percentage = percent((count/sum(count)), accuracy = 0.01)) %>% 
  as.data.frame()
pppp = ggplot()+
   geom_bar(data=z2, aes(x=weekday,y=count),
          stat="identity",position = "dodge", fill = "#093146")+
  geom_point(data=z2,aes(x=weekday,y=count),  colour = "red") +
  geom_line(data=z2,aes(x=weekday,y=count, group = 1), colour = "red")+
  theme_classic()+
  labs(x = "Hour", y = "Count")

ggplotly(pppp)
group_raw = raw %>% drop_na(borough) %>% group_by(weekday, borough) %>%
  summarise(count = n())
tt = ggplot()+
  geom_bar(data=group_raw, aes(x= weekday, y = count, color= borough), stat="identity",position = "dodge", fill = "#093146") +
  geom_histogram() +  
  geom_point(data=group_raw,aes(x=weekday,y=count),  colour = "red") +
  geom_line(data=group_raw,aes(x=weekday,y=count, group = 1), colour = "red")+
 
  facet_wrap(.~borough, scales = "free_y")
ggplotly(tt)
raw = raw %>% 
  mutate(month = month(crash_date))
raw %>% select(hour, month) %>% table() %>% as.data.frame() %>% 
 ggplot() +
  aes(x=month, y=hour, fill=Freq) %>%
  geom_tile()+scale_fill_gradient(low="white", high="darkblue")

raw %>% select(hour, weekday) %>% table() %>% as.data.frame() %>% 
 ggplot() +
  aes(x=weekday, y=hour, fill=Freq) %>%
  geom_tile()+scale_fill_gradient(low="white", high="darkblue")

raw %>% glimpse()
## Rows: 1,615,800
## Columns: 32
## $ crash_date                    <date> 2019-06-20, 2019-07-09, 2019-06-28, 201~
## $ crash_time                    <time> 02:37:00, 18:00:00, 21:00:00, 12:00:00,~
## $ borough                       <chr> "BROOKLYN", NA, "QUEENS", NA, "BROOKLYN"~
## $ zip_code                      <dbl> 11223, NA, 11418, NA, 11236, 10019, 1003~
## $ latitude                      <dbl> 40.59792, NA, 40.70757, 40.85667, 40.634~
## $ longitude                     <dbl> -73.96117, NA, -73.83620, -73.86569, -73~
## $ location                      <chr> "POINT (-73.961174 40.597916)", NA, "POI~
## $ on_street_name                <chr> NA, "NORTHERN BOULEVARD", "METROPOLITAN ~
## $ cross_street_name             <chr> NA, "CLEARVIEW EXPRESSWAY", "116 STREET"~
## $ off_street_name               <chr> "2410      CONEY ISLAND AVENUE", NA, NA,~
## $ number_of_persons_injured     <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0~
## $ number_of_persons_killed      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0~
## $ number_of_pedestrians_injured <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0~
## $ number_of_pedestrians_killed  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0~
## $ number_of_cyclist_injured     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0~
## $ number_of_cyclist_killed      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0~
## $ number_of_motorist_injured    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0~
## $ number_of_motorist_killed     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0~
## $ contributing_factor_vehicle_1 <chr> "Unspecified", "Following Too Closely", ~
## $ contributing_factor_vehicle_2 <chr> "Unspecified", "Passing or Lane Usage Im~
## $ contributing_factor_vehicle_3 <chr> NA, NA, "Unspecified", NA, NA, NA, NA, N~
## $ contributing_factor_vehicle_4 <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ~
## $ contributing_factor_vehicle_5 <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ~
## $ collision_id                  <dbl> 4155032, 4167185, 4161958, 4170995, 4155~
## $ vehicle_type_code_1           <chr> "Station Wagon/Sport Utility Vehicle", "~
## $ vehicle_type_code_2           <chr> "Station Wagon/Sport Utility Vehicle", "~
## $ vehicle_type_code_3           <chr> NA, NA, "Sedan", NA, NA, NA, NA, NA, NA,~
## $ vehicle_type_code_4           <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ~
## $ vehicle_type_code_5           <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ~
## $ hour                          <int> 2, 18, 21, 12, 16, 18, 15, 23, 15, 23, 1~
## $ weekday                       <ord> Thursday, Tuesday, Friday, Thursday, Thu~
## $ month                         <dbl> 6, 7, 6, 7, 6, 7, 7, 6, 6, 7, 7, 6, 7, 6~
dd = data %>% group_by(month, borough) %>% 
  summarise(
            ped = sum(number_of_pedestrians_injured),
            cyc = sum(number_of_cyclist_injured),
            mot = sum(number_of_motorist_injured)
            )
dd = dd %>% pivot_longer(c(-month, -borough))

ll = ggplot(dd,aes(x=month,
                   y=value,
                   fill=name, color = borough))+
  geom_bar(stat='identity',
           position=position_stack(reverse=T))+
  facet_wrap(.~borough, scales = "free_y")
ggplotly(ll)
dd = data %>% group_by(month, borough) %>% 
  summarise(
            ped = sum(number_of_pedestrians_injured),
            cyc = sum(number_of_cyclist_injured),
            mot = sum(number_of_motorist_injured)
            )
dd = dd %>% pivot_longer(c(-month, -borough))

ll = ggplot(dd,aes(x=month,
                   y=value,
                   fill=name, color = name))+
  geom_line()+
  facet_wrap(.~borough, scales = "free_y")
ggplotly(ll)
ff = data %>% group_by(month, borough) %>% 
  summarise(
            ped = sum(number_of_pedestrians_killed),
            cyc = sum(number_of_cyclist_killed),
            mot = sum(number_of_motorist_killed)
            )
ff = ff %>% pivot_longer(c(-month, -borough))

ll = ggplot(ff,aes(x=month,
                   y=value,
                   fill=name, color = borough))+
  geom_bar(stat='identity',
           position=position_stack(reverse=T))+
  facet_wrap(.~borough, scales = "free_y")
ggplotly(ll)
ff = data %>% group_by(month, borough) %>% 
  summarise(
            ped = sum(number_of_pedestrians_killed),
            cyc = sum(number_of_cyclist_killed),
            mot = sum(number_of_motorist_killed)
            )
ff = ff %>% pivot_longer(c(-month, -borough))

ll = ggplot(ff,aes(x=month,
                   y=value,
                   fill=name, color = name))+
  geom_line()+
  facet_wrap(.~borough, scales = "free_y")
ggplotly(ll)
library(plotly)
hg = raw %>% group_by(borough, contributing_factor_vehicle_1) %>% drop_na(borough) %>% summarise(count = n()) %>%ungroup() %>% 
  as.data.frame() %>% 
  arrange(desc(count))

hg = hg %>% arrange(desc(count)) %>% group_by(borough) %>% 
  summarise(factor = head(contributing_factor_vehicle_1),
            freq = head(count))
tt = ggplot(data=hg, aes(x= reorder(factor, freq), y = freq, color= factor))+
  geom_bar(stat="identity", fill = "#093146") +
  facet_wrap(.~borough, scales = "free", ncol = 1)+
  theme(axis.title.x = element_blank(),
          axis.title.y = element_blank())+
  coord_flip()
ggplotly(tt, height = 800, width=1000)
# plot_list = hg$borough %>% as.data.frame() %>% distinct()
# plot = list()
# for (i in 1:5){
# zs =
#   hg %>% filter(borough == plot_list[i, 1]) %>%
#  ggplot(aes(x= reorder(factor, freq), y = freq, color= factor))+
# geom_bar(stat="identity", fill = "#093146")+
#   coord_flip()
# 
# plot[[i]] = zs
# }
# for  (i in 1:5){
#   print(plot[i])
# }
hg = head(
  raw %>% group_by(contributing_factor_vehicle_1) %>% drop_na(contributing_factor_vehicle_1) %>% summarise(count = n()) %>%ungroup() %>% 
  as.data.frame() %>% 
  arrange(desc(count))
  )

tt = ggplot(data=hg, aes(x= reorder(contributing_factor_vehicle_1, count), y = count, color= contributing_factor_vehicle_1))+
  geom_bar(stat="identity", fill = "#093146") +
  theme(axis.title.x = element_blank(),
          axis.title.y = element_blank())+
  coord_flip()
ggplotly(tt)
hg = raw %>% group_by(borough, vehicle_type_code_1) %>% drop_na(borough) %>% summarise(count = n()) %>%ungroup() %>% 
  as.data.frame() %>% 
  arrange(desc(count))

hg = hg %>% arrange(desc(count)) %>% group_by(borough) %>% 
  summarise(factor = head(vehicle_type_code_1),
            freq = head(count))
tt = ggplot(data=hg, aes(x= reorder(factor, freq), y = freq, color= factor))+
  geom_bar(stat="identity", fill = "#093146") +
  facet_wrap(.~borough, scales = "free", ncol = 1)+
  theme(axis.title.x = element_blank(),
          axis.title.y = element_blank())+
  coord_flip()
ggplotly(tt, height = 800, width=1000)
  • Assess Data Provenance
  • Domain knowledge
  • Explore the data structure
  • Look for outliers and missing data

3 Research Question 1 - [INSERT QUESTION HERE]

Here you should

  • Address stakeholders
  • Wrangle your data to explore your research question
  • Create some visualisations
  • Provide a conclusion to the research question

4 Research Question 2 - [INSERT QUESTION HERE]

Here you should

  • Address stakeholders
  • Wrangle your data to explore your research question
  • Create some visualisations
  • Provide a conclusion to the research question

5 Reflection on Data Wrangling

Insert your reflection on how data wrangling helped you explore your research questions. (Don’t forget to adjust information at the top of report regarding your name in the author field etc!!)